# NOTE(review): hard-coded, machine-specific setwd() makes this script
# non-portable; prefer running it from the project directory (or an RStudio
# project / here::here()). Left unchanged here.
setwd("/Users/aimhighfly/Documents/StonyBrookUniversity/Spring_semester/EST508/Projects/twitter")
library("twitteR")
library("tm")
## Loading required package: NLP
library("wordcloud")
## Loading required package: RColorBrewer
library("cluster")
library("FactoMineR")
library("RColorBrewer")
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library("magrittr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("googleVis")
##
## Welcome to googleVis version 0.5.10
##
## Please read the Google API Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit http://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
# Authenticate against the Twitter API.
# SECURITY FIX: the consumer key/secret and access token/secret were
# previously hard-coded here (and therefore committed to source control) --
# they must be treated as compromised and regenerated. Read them from
# environment variables instead; set these in ~/.Renviron or the shell.
setup_twitter_oauth(Sys.getenv("TWITTER_CONSUMER_KEY"),
                    Sys.getenv("TWITTER_CONSUMER_SECRET"),
                    Sys.getenv("TWITTER_ACCESS_TOKEN"),
                    Sys.getenv("TWITTER_ACCESS_SECRET"))
## [1] "Using direct authentication"
# Load Trump's tweets, keeping text columns as plain character vectors.
trump <- read.csv("realDonaldTrump_tweets.csv", stringsAsFactors = FALSE)
str(trump)
## 'data.frame': 3173 obs. of 5 variables:
## $ id : num 7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
## $ created_at: chr "2016-04-05 03:29:30" "2016-04-05 03:16:32" "2016-04-05 02:00:43" "2016-04-05 01:14:09" ...
## $ text : chr "b'MAKE AMERICA GREAT AGAIN!\\nhttps://t.co/iiXHgM7aA2'" "b'\"@FoxNews: @ScottBaio: \"#DonaldTrump is the only guy, I think, that has the will & the nerve to attack & to fight.\"| __truncated__ "b'\"@vikkideiter: Something VERY close to my heart. I\\'m a NAVY VET! I love @realDonaldTrump\\'s VETERANS ADMINISTRATION REFO"| __truncated__ "b'I will be on @SeanHannity @FoxNews- tonight at 10pmE w/ @MELANIATRUMP, from Wisconsin. Enjoy! #WIPrimary #Trump2016 https://t"| __truncated__ ...
## $ retweet : int 977 1171 1460 1933 6271 2988 3719 3343 1662 2858 ...
## $ favorite : int 2266 2956 4474 5959 12606 9141 10085 9852 6128 9270 ...
# Split the "created_at" timestamp ("YYYY-MM-DD HH:MM:SS") into separate
# date and time columns. (Replaces the original's NA-initialised column,
# the no-op paste("date") wrapper, and the invalid substr start index 0.)
trump$time <- substr(trump$created_at, 12, 20)   # "HH:MM:SS"
names(trump)[2] <- "date"                        # rename created_at -> date
trump$date <- substr(trump$date, 1, 10)          # "YYYY-MM-DD"
# Tag every row with the candidate name as a character column (the original
# cbind() coerced it to a factor, contradicting stringsAsFactors = FALSE).
trump$name <- "trump"
# Most-retweeted tweet per day, used by the annotation chart below.
trump_most <- trump %>% group_by(date) %>% filter(retweet == max(retweet))
# Total retweets per hour of day.
trump$time <- substr(trump$time, 1, 2)
trump_t <- trump %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis timeline of the daily most-retweeted tweet.
trump.anno <- gvisAnnotationChart(trump_most,
                                  datevar = "date",
                                  numvar = "retweet",
                                  idvar = "name",
                                  options = list(
                                    width = 600, height = 350,
                                    fill = 10, displayExactValues = TRUE,
                                    colors = "['blue']"))
trump.anno
# Build a tm text corpus from Trump's tweet texts.
trump_corpus <- Corpus(VectorSource(trump$text))
trump_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3173
inspect(trump_corpus[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 53
# Clean the corpus: strip punctuation, lower-case, drop English stopwords
# and digits, collapse whitespace, and remove the "brt" token (presumably
# the b'RT byte-string retweet prefix left after punctuation removal --
# confirm). CONSISTENCY FIX: the other candidates' sections already remove
# "brt"; the Trump section only had a commented-out placeholder.
trump_clean <- tm_map(trump_corpus, removePunctuation)
trump_clean <- tm_map(trump_clean, content_transformer(tolower))
trump_clean <- tm_map(trump_clean, removeWords, stopwords("english"))
trump_clean <- tm_map(trump_clean, removeNumbers)
trump_clean <- tm_map(trump_clean, stripWhitespace)
trump_clean <- tm_map(trump_clean, removeWords, c("brt"))
inspect(trump_clean[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 42
# Word cloud of the 50 most frequent cleaned terms.
wordcloud(trump_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
#' Lexicon-based sentiment score for a vector of sentences.
#'
#' For each sentence: strip punctuation, control characters and digits,
#' lower-case it, split it on whitespace, and score it as
#'   (# words found in pos.words) - (# words found in neg.words).
#'
#' @param sentences Character vector of raw sentences/tweets.
#' @param pos.words Character vector of positive lexicon words.
#' @param neg.words Character vector of negative lexicon words.
#' @param .progress Retained for backward compatibility with the previous
#'   plyr::laply() implementation; no longer used (no progress bar).
#' @return data.frame with columns `score` (one number per sentence) and
#'   `text` (the original input, as character).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # NOTE(review): the previous version called require(plyr)/require(stringr)
  # inside the function; loading plyr after dplyr masks several dplyr verbs
  # (see the console warnings in the transcript). Base R covers everything
  # this function needs, so no packages are loaded here.
  scores <- vapply(sentences, function(sentence) {
    # Clean up with R's regex-driven global substitute, gsub().
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # Lower-case so words match the (lower-case) lexicon entries.
    sentence <- tolower(sentence)
    # Split on runs of whitespace (base strsplit replaces str_split).
    words <- unlist(strsplit(sentence, '\\s+'))
    # %in% yields TRUE/FALSE per word -- equivalent to the original
    # !is.na(match(...)) -- and TRUE/FALSE sum as 1/0.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences, stringsAsFactors = FALSE)
}
# Load the positive/negative word lists (presumably the Hu & Liu opinion
# lexicon -- files not shown here; ';' marks comment lines in those files).
pos.words = scan("positive-words.txt", what='character', comment.char=';')
neg.words = scan("negative-words.txt", what='character', comment.char=';')
# Score every Trump tweet: (# positive words) - (# negative words).
trump_sentiment = score.sentiment(trump$text, pos.words, neg.words)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:twitteR':
##
## id
## Loading required package: stringr
# Distribution of sentiment scores across all of Trump's tweets.
table(trump_sentiment$score)
##
## -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6
## 3 7 26 67 159 374 825 898 496 212 87 14 5
hist(trump_sentiment$score)
# Does sentiment relate to engagement? Retweet counts broken down by score.
boxplot(trump$retweet ~ trump_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Trump")
# Factor version of the score so qplot draws one box/colour per score level.
trump_sentiment_score <- as.factor(trump_sentiment$score)
qplot(trump_sentiment_score, trump$retweet, geom=c("boxplot"), color = trump_sentiment_score,
main="Retweeted count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Retweeted count")
# Scatter plot with a fitted smooth for the same relationship.
qplot(trump_sentiment$score, trump$retweet, geom=c("point", "smooth"),
main="Retweeted count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Retweeted count")
# Same pair of views for favourite counts...
qplot(trump_sentiment_score, trump$favorite, geom=c("boxplot"), color = trump_sentiment_score,
main="Favorite count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Favorite count")
qplot(trump_sentiment$score, trump$favorite, geom=c("point", "smooth"),
main="Favorite count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Favorite count")
# ...and for the retweet/favourite ratio (rows where favorite is 0 yield
# non-finite values and are dropped by ggplot -- see the warnings below).
qplot(trump_sentiment_score, trump$retweet/trump$favorite, geom=c("boxplot"), color = trump_sentiment_score,
main="Retweeted count/Favorite count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 86 rows containing non-finite values (stat_boxplot).
qplot(trump_sentiment$score, trump$retweet/trump$favorite, geom=c("point", "smooth"),
main="Retweeted count/Favorite count VS Sentiment score of Trump",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 86 rows containing non-finite values (stat_smooth).
# Load Hillary's tweets, keeping text columns as plain character vectors.
hillary <- read.csv("HillaryClinton_tweets.csv", stringsAsFactors = FALSE)
str(hillary)
## 'data.frame': 3218 obs. of 5 variables:
## $ id : num 7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
## $ created_at: chr "2016-04-05 00:39:33" "2016-04-04 23:47:45" "2016-04-04 23:12:51" "2016-04-04 22:26:50" ...
## $ text : chr "b\"How can you stop @GovWalker and the GOP's attacks on education and women's rights?\\n\\nVote: https://t.co/XmpM1irN6v https:"| __truncated__ "b'RT @TheBriefing2016: Hillary has earned the most votes \\xe2\\x9c\\x93\\nWhen more people vote, Hillary wins \\xe2\\x9c\\x93\"| __truncated__ "b'Born on this day in 1928, Maya Angelou\\xe2\\x80\\x99s voice holds a powerful place in the ongoing fight for justice. https:/"| __truncated__ "b'48 years ago, we lost a giant in the fight for equality. Let\\xe2\\x80\\x99s honor Dr. King and keep bending the arc of the m"| __truncated__ ...
## $ retweet : int 481 479 1728 1417 1042 269 7420 888 893 1956 ...
## $ favorite : int 1048 0 3996 3488 2090 544 0 2115 2444 4061 ...
# Split the "created_at" timestamp ("YYYY-MM-DD HH:MM:SS") into separate
# date and time columns. (Replaces the original's NA-initialised column,
# the no-op paste("date") wrapper, and the invalid substr start index 0.)
hillary$time <- substr(hillary$created_at, 12, 20)   # "HH:MM:SS"
names(hillary)[2] <- "date"                          # created_at -> date
hillary$date <- substr(hillary$date, 1, 10)          # "YYYY-MM-DD"
# Candidate tag as character (the original cbind() coerced it to a factor,
# contradicting stringsAsFactors = FALSE).
hillary$name <- "hillary"
# Most-retweeted tweet per day, used by the annotation chart below.
hillary_most <- hillary %>% group_by(date) %>% filter(retweet == max(retweet))
# Total retweets per hour of day.
hillary$time <- substr(hillary$time, 1, 2)
hillary_t <- hillary %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis timeline of the daily most-retweeted tweet.
hillary.anno <- gvisAnnotationChart(hillary_most,
                                    datevar = "date",
                                    numvar = "retweet",
                                    idvar = "name",
                                    options = list(
                                      width = 600, height = 350,
                                      fill = 10, displayExactValues = TRUE,
                                      colors = "['red']"))
hillary.anno
# Build a tm text corpus from Hillary's tweet texts.
hillary_corpus <- Corpus(VectorSource(hillary$text))
hillary_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3218
inspect(hillary_corpus[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 142
# Clean the corpus in one pipeline: strip punctuation, lower-case, drop
# English stopwords and digits, collapse whitespace, and remove the "brt"
# token (presumably the b'RT byte-string retweet prefix left over after
# punctuation removal -- confirm).
hillary_clean <- hillary_corpus %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, c("brt"))
inspect(hillary_clean[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 101
# Word cloud of the 50 most frequent cleaned terms.
wordcloud(hillary_clean, random.order = FALSE, max.words = 50,
          scale = c(4, 0.5), col = rainbow(50))
# Score Hillary's tweets with the same lexicon-based function.
hillary_sentiment = score.sentiment(hillary$text, pos.words, neg.words)
# Distribution of sentiment scores across all of Hillary's tweets.
table(hillary_sentiment$score)
##
## -4 -3 -2 -1 0 1 2 3 4 5 6
## 13 29 131 364 1448 838 301 80 11 2 1
hist(hillary_sentiment$score)
# Retweet counts broken down by sentiment score.
boxplot(hillary$retweet ~ hillary_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Clinton")
# Factor version of the score, used for the colour aesthetic below.
# NOTE(review): unlike the Trump section, x here is the numeric score and
# only the colour uses the factor -- consider using the factor as x for
# consistent boxplot grouping.
hillary_sentiment_score <- as.factor(hillary_sentiment$score)
qplot(hillary_sentiment$score, hillary$retweet, geom=c("boxplot"), color = hillary_sentiment_score,
main="Retweeted count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Retweeted count")
# Scatter plot with a fitted smooth for the same relationship.
qplot(hillary_sentiment$score, hillary$retweet, geom=c("point", "smooth"),
main="Retweeted count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Retweeted count")
# Same pair of views for favourite counts...
qplot(hillary_sentiment$score, hillary$favorite, geom=c("boxplot"), color = hillary_sentiment_score,
main="Favorite count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Favorite count")
qplot(hillary_sentiment$score, hillary$favorite, geom=c("point", "smooth"),
main="Favorite count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Favorite count")
# ...and for the retweet/favourite ratio (rows where favorite is 0 yield
# non-finite values and are dropped by ggplot -- see the warnings below).
qplot(hillary_sentiment$score, hillary$retweet/hillary$favorite, geom=c("boxplot"), color = hillary_sentiment_score,
main="Retweeted count/Favorite count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 828 rows containing non-finite values (stat_boxplot).
qplot(hillary_sentiment$score, hillary$retweet/hillary$favorite, geom=c("point", "smooth"),
main="Retweeted count/Favorite count VS Sentiment score of Hillary",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 828 rows containing non-finite values (stat_smooth).
# Load Ted Cruz's tweets, keeping text columns as plain character vectors.
ted <- read.csv("tedcruz_tweets.csv", stringsAsFactors = FALSE)
str(ted)
## 'data.frame': 3216 obs. of 5 variables:
## $ id : num 7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
## $ created_at: chr "2016-04-05 02:39:52" "2016-04-05 02:17:10" "2016-04-05 01:38:02" "2016-04-05 01:34:04" ...
## $ text : chr "b\"Congrats to #CruzCrew's Karen & her husband who won our #NationalChampionship contest! Thank you for the support! https:"| __truncated__ "b'Thank you Waukesha! #ChooseCruz tomorrow: https://t.co/elr0EH0EBs https://t.co/uq14DboB4m'" "b'RT @megynkelly: .@tedcruz on abortion: \\xe2\\x80\\x9cI\\xe2\\x80\\x99m pro-life. I believe that we should protect every huma"| __truncated__ "b'RT @FoxNews: .@TedCruz: \\xe2\\x80\\x9cObamaCare is the biggest job killer in this country. Millions of Americans are hurting"| __truncated__ ...
## $ retweet : int 117 313 466 335 153 173 136 138 487 145 ...
## $ favorite : int 411 596 0 0 0 422 0 0 782 0 ...
# Split the "created_at" timestamp ("YYYY-MM-DD HH:MM:SS") into separate
# date and time columns. (Replaces the original's NA-initialised column,
# the no-op paste("date") wrapper, and the invalid substr start index 0.)
ted$time <- substr(ted$created_at, 12, 20)   # "HH:MM:SS"
names(ted)[2] <- "date"                      # created_at -> date
ted$date <- substr(ted$date, 1, 10)          # "YYYY-MM-DD"
# Candidate tag as character (the original cbind() coerced it to a factor,
# contradicting stringsAsFactors = FALSE).
ted$name <- "ted"
# Most-retweeted tweet per day, used by the annotation chart below.
ted_most <- ted %>% group_by(date) %>% filter(retweet == max(retweet))
# Total retweets per hour of day.
ted$time <- substr(ted$time, 1, 2)
ted_t <- ted %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis timeline of the daily most-retweeted tweet.
ted.anno <- gvisAnnotationChart(ted_most,
                                datevar = "date",
                                numvar = "retweet",
                                idvar = "name",
                                options = list(
                                  width = 600, height = 350,
                                  fill = 10, displayExactValues = TRUE,
                                  colors = "['green']"))
ted.anno
# Build a tm text corpus from Ted Cruz's tweet texts.
ted_corpus <- Corpus(VectorSource(ted$text))
ted_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3216
inspect(ted_corpus[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 144
# Clean the corpus in one pipeline: strip punctuation, lower-case, drop
# English stopwords and digits, collapse whitespace, and remove the "brt"
# token (presumably the b'RT byte-string retweet prefix left over after
# punctuation removal -- confirm).
ted_clean <- ted_corpus %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, c("brt"))
inspect(ted_clean[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 102
# Word cloud of the 50 most frequent cleaned terms.
wordcloud(ted_clean, random.order = FALSE, max.words = 50,
          scale = c(4, 0.5), col = rainbow(50))
# Score Ted Cruz's tweets with the same lexicon-based function.
ted_sentiment = score.sentiment(ted$text, pos.words, neg.words)
# Distribution of sentiment scores across all of Ted's tweets.
table(ted_sentiment$score)
##
## -5 -4 -3 -2 -1 0 1 2 3 4 5
## 2 1 13 46 245 1714 894 233 53 13 2
hist(ted_sentiment$score)
# Retweet counts broken down by sentiment score.
boxplot(ted$retweet ~ ted_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Ted")
# Factor version of the score, used for the colour aesthetic below.
ted_sentiment_score <- as.factor(ted_sentiment$score)
qplot(ted_sentiment$score, ted$retweet, geom=c("boxplot"), color = ted_sentiment_score,
main="Retweeted count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Retweeted count")
# Scatter plot with a fitted smooth for the same relationship.
qplot(ted_sentiment$score, ted$retweet, geom=c("point", "smooth"),
main="Retweeted count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Retweeted count")
# Same pair of views for favourite counts...
qplot(ted_sentiment$score, ted$favorite, geom=c("boxplot"), color = ted_sentiment_score,
main="Favorite count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Favorite count")
qplot(ted_sentiment$score, ted$favorite, geom=c("point", "smooth"),
main="Favorite count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Favorite count")
# ...and for the retweet/favourite ratio (rows where favorite is 0 yield
# non-finite values and are dropped by ggplot -- see the warnings below).
qplot(ted_sentiment$score, ted$retweet/ted$favorite, geom=c("boxplot"), color = ted_sentiment_score,
main="Retweeted count/Favorite count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 2038 rows containing non-finite values (stat_boxplot).
qplot(ted_sentiment$score, ted$retweet/ted$favorite, geom=c("point", "smooth"),
main="Retweeted count/Favorite count VS Sentiment score of Ted",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 2038 rows containing non-finite values (stat_smooth).
# Load Bernie's tweets, keeping text columns as plain character vectors.
bernie <- read.csv("BernieSanders_tweets.csv", stringsAsFactors = FALSE)
str(bernie)
## 'data.frame': 3184 obs. of 5 variables:
## $ id : num 7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
## $ created_at: chr "2016-04-05 01:17:04" "2016-04-05 00:25:28" "2016-04-04 23:56:07" "2016-04-04 23:22:33" ...
## $ text : chr "b'\"The major media don\\xe2\\x80\\x99t know how to report on movements.\" \\xe2\\x80\\x94 @RBReich\\nhttps://t.co/86IOLPIkmC'" "b\"Making the American taxpayer pick up BP's bill for cleaning the disastrous Deepwater Horizon spill is an outrage. https://t."| __truncated__ "b\"A victory for workers fighting for collective bargaining rights at Trump's Vegas hotel. Unions make America great.\\nhttps:/"| __truncated__ "b'RT @ariannaijones: \\xe2\\x80\\x9cWhy I endorse Bernie Sanders for President of the United States\\xe2\\x80\\x9d by Barbara L"| __truncated__ ...
## $ retweet : int 580 1286 1067 583 3998 2563 1472 1151 2352 479 ...
## $ favorite : int 1282 2151 2486 0 7032 5921 4064 2801 6200 0 ...
# Split the "created_at" timestamp ("YYYY-MM-DD HH:MM:SS") into separate
# date and time columns. (Replaces the original's NA-initialised column,
# the no-op paste("date") wrapper, and the invalid substr start index 0.)
bernie$time <- substr(bernie$created_at, 12, 20)   # "HH:MM:SS"
names(bernie)[2] <- "date"                         # created_at -> date
bernie$date <- substr(bernie$date, 1, 10)          # "YYYY-MM-DD"
# Candidate tag as character (the original cbind() coerced it to a factor,
# contradicting stringsAsFactors = FALSE).
bernie$name <- "bernie"
# Most-retweeted tweet per day, used by the annotation chart below.
bernie_most <- bernie %>% group_by(date) %>% filter(retweet == max(retweet))
# Total retweets per hour of day.
bernie$time <- substr(bernie$time, 1, 2)
bernie_t <- bernie %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis timeline of the daily most-retweeted tweet.
bernie.anno <- gvisAnnotationChart(bernie_most,
                                   datevar = "date",
                                   numvar = "retweet",
                                   idvar = "name",
                                   options = list(
                                     width = 600, height = 350,
                                     fill = 10, displayExactValues = TRUE,
                                     colors = "['yellow']"))
bernie.anno
# Build a tm text corpus from Bernie's tweet texts.
bernie_corpus <- Corpus(VectorSource(bernie$text))
bernie_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3184
# BUG FIX: this previously inspected ted_corpus (copy-paste slip from the
# Ted section); inspect Bernie's corpus instead. The pasted output below
# came from the old, wrong call and will differ on re-run.
inspect(bernie_corpus[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 144
# Clean the corpus: strip punctuation, lower-case, drop English stopwords
# and digits, collapse whitespace, and remove the "brt" token (presumably
# the b'RT byte-string retweet prefix left after punctuation removal --
# confirm).
bernie_clean <- tm_map(bernie_corpus, removePunctuation)
bernie_clean <- tm_map(bernie_clean, content_transformer(tolower))
bernie_clean <- tm_map(bernie_clean, removeWords, stopwords("english"))
bernie_clean <- tm_map(bernie_clean, removeNumbers)
bernie_clean <- tm_map(bernie_clean, stripWhitespace)
bernie_clean <- tm_map(bernie_clean, removeWords, c("brt"))
inspect(bernie_clean[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 77
# Word cloud of the 50 most frequent cleaned terms.
wordcloud(bernie_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
# Score Bernie's tweets with the same lexicon-based function.
bernie_sentiment = score.sentiment(bernie$text, pos.words, neg.words)
# Distribution of sentiment scores across all of Bernie's tweets.
table(bernie_sentiment$score)
##
## -6 -5 -4 -3 -2 -1 0 1 2 3 4 5
## 1 1 11 37 127 417 1544 755 236 48 6 1
hist(bernie_sentiment$score)
# Retweet counts broken down by sentiment score.
boxplot(bernie$retweet ~ bernie_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Bernie")
# Factor version of the score, used for the colour aesthetic below.
bernie_sentiment_score <- as.factor(bernie_sentiment$score)
qplot(bernie_sentiment$score, bernie$retweet, geom=c("boxplot"), color = bernie_sentiment_score,
main="Retweeted count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Retweeted count")
# Scatter plot with a fitted smooth for the same relationship.
qplot(bernie_sentiment$score, bernie$retweet, geom=c("point", "smooth"),
main="Retweeted count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Retweeted count")
# Same pair of views for favourite counts...
qplot(bernie_sentiment$score, bernie$favorite, geom=c("boxplot"), color = bernie_sentiment_score,
main="Favorite count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Favorite count")
qplot(bernie_sentiment$score, bernie$favorite, geom=c("point", "smooth"),
main="Favorite count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Favorite count")
# ...and for the retweet/favourite ratio (rows where favorite is 0 yield
# non-finite values and are dropped by ggplot -- see the warnings below).
qplot(bernie_sentiment$score, bernie$retweet/bernie$favorite, geom=c("boxplot"), color = bernie_sentiment_score,
main="Retweeted count/Favorite count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 1089 rows containing non-finite values (stat_boxplot).
qplot(bernie_sentiment$score, bernie$retweet/bernie$favorite, geom=c("point", "smooth"),
main="Retweeted count/Favorite count VS Sentiment score of Bernie",
xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 1089 rows containing non-finite values (stat_smooth).
# Stack all four candidates' tweets into one data frame for comparison
# (the per-candidate frames share the same columns: id, date, text,
# retweet, favorite, time, name).
comparison <- rbind(trump, hillary, ted, bernie)
str(comparison)
## 'data.frame': 12791 obs. of 7 variables:
## $ id : num 7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
## $ date : chr "2016-04-05" "2016-04-05" "2016-04-05" "2016-04-05" ...
## $ text : chr "b'MAKE AMERICA GREAT AGAIN!\\nhttps://t.co/iiXHgM7aA2'" "b'\"@FoxNews: @ScottBaio: \"#DonaldTrump is the only guy, I think, that has the will & the nerve to attack & to fight.\"| __truncated__ "b'\"@vikkideiter: Something VERY close to my heart. I\\'m a NAVY VET! I love @realDonaldTrump\\'s VETERANS ADMINISTRATION REFO"| __truncated__ "b'I will be on @SeanHannity @FoxNews- tonight at 10pmE w/ @MELANIATRUMP, from Wisconsin. Enjoy! #WIPrimary #Trump2016 https://t"| __truncated__ ...
## $ retweet : int 977 1171 1460 1933 6271 2988 3719 3343 1662 2858 ...
## $ favorite: int 2266 2956 4474 5959 12606 9141 10085 9852 6128 9270 ...
## $ time : chr "03" "03" "02" "01" ...
## $ name : Factor w/ 4 levels "trump","hillary",..: 1 1 1 1 1 1 1 1 1 1 ...
# Combine each candidate's daily most-retweeted tweets into one timeline,
# coloured per candidate by the googleVis annotation chart below.
comparison_most <- rbind(trump_most, hillary_most, ted_most, bernie_most)
comparison.anno <- gvisAnnotationChart(comparison_most,
datevar="date",
numvar="retweet",
idvar="name",
options=list(
width=700, height=400,
fill=10, displayExactValues=TRUE,
colors="['blue','red', 'green', 'yellow']")
)
comparison.anno
# 2x2 grid of the four candidates' word clouds for side-by-side comparison.
# NOTE(review): par(mfrow) is changed without saving/restoring the previous
# settings, so any later base-graphics plot will keep the 2x2 layout.
par(mfrow=c(2,2))
wordcloud(trump_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(hillary_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(ted_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(bernie_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))